/*

****************************************************************************

Summary:

This file creates address variables for all mid- and high-rise public housing buildings in Chicago.

Note that this file cleans the address strings in the social assistance case file
to make sure they can be merged on to a list of public housing buildings.

The issue here is that there are various ways to write down an address: "36TH" versus
"36th" street.

This file proceeds by

1) opening the social assistance addresses and cleaning the addresses manually,
2) merging cleaned addresses with the public housing building list and saving a cross-walk 

The final output is a cross-walk that connects 

(A) the raw/original case address strings to
(B) the public housing building info for that address if it is a PH building 

For example, for building A, the IDHS records may contain two different address
strings: "1100 36TH ST" and "1100 E. 36TH". Both of these should be matched to the
same PH building (with info about which building it is and its # of units).
 
****************************************************************************

*/

* ---- session setup: empty memory, close any open log, disable paging ----
clear
capture log close
set more off

* ---- directory globals (left blank here; fill in before running) ----
* Each global is glued directly in front of a file name further down,
* e.g. "${raw}caseaddr_200606_201212.dta.gz", so it needs its trailing slash.

* directory for storing demolition related files
global demo      ""
* raw directory for social assistance case files
global raw       ""
* directory for geocoded address assistance data file
global xwalk     ""

********************************************************************************
*** STEP 1.
*** Load the raw social assistance address data and build rough address
*** variables by hand. The PH building file keeps its addresses in four
*** separate fields (strnum, strdir, strtype, strname), so the code below
*** searches the raw address string for the PH-relevant components.
********************************************************************************

gzuse "${raw}caseaddr_200606_201212.dta.gz", clear

* The intermediate address file (it carries more refined address fields for
* the raw data) is stored gzipped: unpack it, merge it on, then pack it again.
!gunzip "${xwalk}xwalk_addresses_pre_geo_spell_full.dta.gz"
merge n:1 caseaddr casecity casezip using "${xwalk}xwalk_addresses_pre_geo_spell_full.dta", keep(master match) keepusing(address state city zip)
!gzip "${xwalk}xwalk_addresses_pre_geo_spell_full.dta"

* filter: matched records located in Chicago, IL only
keep if _merge==3 & state=="IL" & city=="CHICAGO"

drop _merge

** Step 1.1 Create variables related to street address (these will be used to merge with public housing files)

* street number
* First choice: the run of digits that opens the address (optionally after
* blanks), i.e. the house number. Taking the longest digit run found anywhere
* in the string returned the street ordinal for short house numbers
* ("44 E 130TH ST" gave "130") and cut five-digit house numbers to four digits.
gen strnum = regexs(1) if regexm(address, "^[ ]*([0-9]+)")

* Fallback for addresses that do not start with a number: the longest run of
* up to four digits found in the string (leftmost run of each length).
gen temp1 = regexs(0) if(regexm(address, "[0-9]"))
gen temp2 = regexs(0) if(regexm(address, "[0-9][0-9]"))
gen temp3 = regexs(0) if(regexm(address, "[0-9][0-9][0-9]"))
gen temp4 = regexs(0) if(regexm(address, "[0-9][0-9][0-9][0-9]"))

* longest candidate first; shorter ones only fill rows that are still blank
foreach k in 4 3 2 1 {
    replace strnum = temp`k' if strnum==""
}

drop temp*

* street direction (one-letter code taken from the spelled-out word)
gen strdir = ""

* NORTH and SOUTH are assigned unconditionally, in this order, so SOUTH
* overrides NORTH when an address contains both words.
foreach word in NORTH SOUTH {
    replace strdir = substr("`word'", 1, 1) if regexm(address, "`word'")
}

* EAST and WEST only fill rows that are still blank, because an address may
* read "North West" or "South West" and must keep its N/S code.
foreach word in EAST WEST {
    replace strdir = substr("`word'", 1, 1) if regexm(address, "`word'") & strdir==""
}

* street type
* Later assignments overwrite earlier ones, so "ST" wins when several type
* words occur in the same address.
*
* The period in the abbreviation patterns is written as [.] so that it matches
* a literal "." only. As a bare "." it is the regex wildcard: "ST." then
* matched "EAST ", "WEST " and "STATE", which set strtype to "ST" for almost
* every address and left the direction-based defaults applied later with
* nothing to fill in.
gen strtype = ""

replace strtype = "AVE" if regexm(address, "AVENUE")
replace strtype = "AVE" if regexm(address, "AVE ") // 4/1/2014 addition
replace strtype = "AVE" if regexm(address, "AVE[.]") // 4/1/2014 addition
replace strtype = "AVE" if regexm(address, " AVE$") // abbreviation closing the address
replace strtype = "BLVD" if regexm(address, "BOULEVARD")
replace strtype = "PL" if regexm(address, "PLACE")
*replace strtype = "PL" if regexm(address, "PLAZ")
replace strtype = "ST" if regexm(address, "STREET")
replace strtype = "ST" if regexm(address, "STR[.]") // 4/1/2014 edit
replace strtype = "ST" if regexm(address, " ST[.]") // 4/1/2014 addition

* A bare " ST" word (no period) is the weakest signal, so it only fills rows
* where no other street type was found.
replace strtype = "ST" if (regexm(address, " ST ") | regexm(address, " ST$")) & strtype==""

* street names
gen strname = ""

* Numbered streets 36TH, 37TH and 63RD, in digit and spelled-out forms.
* For the digit forms the address is searched with a blank glued on in front
* and the pattern opens with [^0-9]: the number must not be preceded by
* another digit, so "36TH" is no longer found inside "136TH". The leading
* blank lets [^0-9] also match when the number starts the address.
* The bare " NN " form only fills rows that are still blank, because it can
* also be a house number.

replace strname = "36TH" if regexm(" " + address, "[^0-9]36th")
replace strname = "36TH" if regexm(" " + address, "[^0-9]36TH")
replace strname = "36TH" if regexm(" " + address, "[^0-9]36 TH ")
replace strname = "36TH" if regexm(address, " 36 ") & strname==""
replace strname = "36TH" if regexm(address, "THIRTYSIX")
replace strname = "36TH" if regexm(address, "THIRTY SIX")
replace strname = "36TH" if regexm(address, "THIRTY SIXTH")
replace strname = "36TH" if regexm(address, "THIRTY-SIXTH")
replace strname = "36TH" if regexm(address, "THIRTYSIXTH")

replace strname = "37TH" if regexm(" " + address, "[^0-9]37th")
replace strname = "37TH" if regexm(" " + address, "[^0-9]37TH")
replace strname = "37TH" if regexm(" " + address, "[^0-9]37 TH ")
replace strname = "37TH" if regexm(address, " 37 ") & strname==""
replace strname = "37TH" if regexm(address, "THIRTYSEVEN")
replace strname = "37TH" if regexm(address, "THIRTY SEVEN")
replace strname = "37TH" if regexm(address, "THIRTY SEVENTH")
replace strname = "37TH" if regexm(address, "THIRTY-SEVENTH")
replace strname = "37TH" if regexm(address, "THIRTYSEVENTH")

replace strname = "63RD" if regexm(" " + address, "[^0-9]63rd")
replace strname = "63RD" if regexm(" " + address, "[^0-9]63RD")
replace strname = "63RD" if regexm(" " + address, "[^0-9]63 RD ")
replace strname = "63RD" if regexm(address, " 63 ") & strname==""
replace strname = "63RD" if regexm(address, "SIXTYTHREE")
replace strname = "63RD" if regexm(address, "SIXTY THREE")
replace strname = "63RD" if regexm(address, "SIXTY THIRD")
replace strname = "63RD" if regexm(address, "SIXTY-THIRD")
replace strname = "63RD" if regexm(address, "SIXTYTHIRD")

* Named PH streets: the name is assigned whenever it occurs anywhere in the
* address. The list is walked in order and each hit overwrites the previous
* one, so a name later in the list wins when an address contains several.
foreach street in ADAMS BOWEN BROWNING CALUMET "COTTAGE GROVE" ELLIS EVANS ///
                  FEDERAL HERMITAGE HOYNE JACKSON LAKE MONROE PRAIRIE ///
                  STATE WABASH WASHINGTON WOLCOTT WOOD {
    replace strname = "`street'" if regexm(address, "`street'")
}

* 4/5/2014 -- LOCKWOOD contains "WOOD" but shouldn't be one of the PH
* addresses, so the street name is blanked again for those rows
replace strname = "" if regexm(address, "LOCKWOOD")

*** MANUAL EDITS

 /* The street type is often absent from the address, and a blank "strtype"
    cannot merge with the building list. For the direction/name pairs below
    the type is therefore filled in by hand, and only where it is still blank.

    Each street was looked up on Google maps to verify that no other "type"
    exists for that street name, i.e. "ELLIS DR." VERSUS "ELLIS STREET".

    The rules are grouped by type and direction; every (direction, name) pair
    appears once, so the order of the groups does not matter.
 */

* avenues
foreach street in HOYNE HERMITAGE {
    replace strtype = "AVE" if strdir=="N" & strname=="`street'" & strtype==""
}
foreach street in ELLIS CALUMET "COTTAGE GROVE" EVANS PRAIRIE WABASH {
    replace strtype = "AVE" if strdir=="S" & strname=="`street'" & strtype==""
}
foreach street in BOWEN BROWNING {
    replace strtype = "AVE" if strdir=="E" & strname=="`street'" & strtype==""
}

* boulevard
replace strtype = "BLVD" if strdir=="W" & strname=="JACKSON" & strtype==""

* streets
foreach street in MONROE ADAMS LAKE {
    replace strtype = "ST" if strdir=="W" & strname=="`street'" & strtype==""
}
foreach street in FEDERAL STATE {
    replace strtype = "ST" if strdir=="S" & strname=="`street'" & strtype==""
}
replace strtype = "ST" if strdir=="N" & strname=="WOOD" & strtype==""
replace strtype = "ST" if strdir=="E" & strname=="63RD" & strtype==""

* place: E 36TH could be street or place, but they are both part of the same demo group
replace strtype = "PL" if strdir=="E" & strname=="36TH" & strtype==""

* Remaining numbered streets not captured above, grouped by ordinal suffix.
*
* The digit patterns are searched in the address with a blank glued on in
* front and open with [^0-9], i.e. the number must not be preceded by another
* digit. Without that guard a shorter number matches inside a longer one:
* "32ND" was found in "132ND", and because 32 is processed after 132 every
* 132ND address ended up as "32ND" (likewise "21ST" in "121ST", "24TH" in
* "124TH", ...). The leading blank lets [^0-9] match at the start of the
* address as well.
*
* The first three forms overwrite any earlier street name; the bare " NN "
* form only fills rows that are still blank, because it can also be a house
* number.

* TH
foreach num of numlist 12 130 13 14 15 19 24 28 38 39 57 65 67 69 {

replace strname = "`num'TH" if regexm(" " + address, "[^0-9]`num'th")
replace strname = "`num'TH" if regexm(" " + address, "[^0-9]`num'TH")
replace strname = "`num'TH" if regexm(" " + address, "[^0-9]`num' TH ")
replace strname = "`num'TH" if regexm(address, " `num' ") & strname==""

}

* ND
foreach num of numlist 132 32 42 {

replace strname = "`num'ND" if regexm(" " + address, "[^0-9]`num'nd")
replace strname = "`num'ND" if regexm(" " + address, "[^0-9]`num'ND")
replace strname = "`num'ND" if regexm(" " + address, "[^0-9]`num' ND ")
replace strname = "`num'ND" if regexm(address, " `num' ") & strname==""

}

* ST

foreach num of numlist 21 41 {

replace strname = "`num'ST" if regexm(" " + address, "[^0-9]`num'st")
replace strname = "`num'ST" if regexm(" " + address, "[^0-9]`num'ST")
replace strname = "`num'ST" if regexm(" " + address, "[^0-9]`num' ST ")
replace strname = "`num'ST" if regexm(address, " `num' ") & strname==""

}

* RD

foreach num of numlist 43 63 {

replace strname = "`num'RD" if regexm(" " + address, "[^0-9]`num'rd")
replace strname = "`num'RD" if regexm(" " + address, "[^0-9]`num'RD")
replace strname = "`num'RD" if regexm(" " + address, "[^0-9]`num' RD ")
replace strname = "`num'RD" if regexm(address, " `num' ") & strname==""

}

*** Obtain remaining address names
* The case data in memory is set aside, the PH building list is opened to
* collect its non-numbered street names, and the case data is brought back.
preserve

use "${demo}chabuildinglist.dta", clear

* drop all ordinal street names, i.e. names that consist of digits followed
* by ST/ND/RD/TH. Testing for the two letters anywhere in the name would also
* throw away ordinary names that merely contain them (e.g. "STATE").
drop if regexm(strname, "^[0-9]+[ ]*(ST|ND|RD|TH)$")

* store as local remaining street names
levelsof strname if bunits>=40, local(temp)
 /* here, I'm only focusing on the larger buildings (note high-rise is typically defined as a building with 75 or more units).
    The cut-off is 40 or more units, the same rule that is applied after the merge with the building list. */

restore

* loop over local created above which stored PH-street names
* - strpos() looks for the name as literal text, so characters in a building
*   name that are regex operators (".", "(", ...) cannot distort the search
* - only rows without a street name are filled
* - LOCKWOOD addresses are skipped: their street name was blanked on purpose
*   (not a PH address) and "WOOD" from the building list must not re-enter
foreach name of local temp {
 display "`name'"
 replace strname = "`name'" if strpos(address, "`name'")>0 & strname=="" & strpos(address, "LOCKWOOD")==0
}

********************************************************************************
*** STEP 2.
*** Merge the raw addresses, via the strnum/strtype/strdir/strname variables
*** built above, with the original list of public housing buildings. Only
*** buildings with 40 or more units should be found.
********************************************************************************

* only addresses that hit a building survive the merge
merge n:1 strnum strname strdir strtype using "${demo}chabuildinglist.dta", keep(match) keepusing(bdg bunits prjname bdgtype) nogenerate

* keep the raw address fields for large PH buildings (mid and high-rises)
* NOTE(review): a missing bunits also satisfies bunits>=40 in Stata -- confirm
* the building list has no missing unit counts.
keep if bunits>=40

keep address city state zip prjname bdgtype bunits bdg

* flatten: one row per distinct address/building combination
duplicates drop

compress
gzsave "${demo}xwalk_case_addr_PH_buildings.dta.gz", replace


